﻿using System.Collections.Generic;
using System.Linq;

namespace KnifeZ.ClassLib.LuceneNP
{
    /// <summary>
    /// Description   : DocExtractor - keyword extraction helpers built on the PanGu word segmenter.
    /// Created by    : KnifeZ
    /// Created on    : 2019/2/13 14:39:11
    /// Last edited by: KnifeZ
    /// Last edited on: 2019/2/13 14:39:11
    /// </summary>
    public class DocExtractor
    {
        /// <summary>
        /// Names of the part-of-speech values a token must carry to be considered a keyword candidate.
        /// A token qualifies only when <c>Pos.ToString()</c> equals one of these names exactly.
        /// NOTE(review): the original comment also mentioned adjectives (POS_D_A), which are not in this
        /// set; a Pos value whose ToString() lists several names does not match either - confirm whether
        /// such values should qualify.
        /// </summary>
        private static readonly HashSet<string> KeywordPosNames =
            new HashSet<string> { "POS_A_NS", "POS_D_V", "POS_D_N" };

        /// <summary>Maximum number of keywords returned by <see cref="GetKeys"/>.</summary>
        private const int MaxKeywords = 5;

        /// <summary>Texts shorter than this many characters use the fixed-divisor weight formula.</summary>
        private const int ShortContentLength = 1000;

        /// <summary>
        /// Extracts up to five keywords from a text.
        /// </summary>
        /// <remarks>
        /// Tokens are kept when their frequency is positive, they are longer than one character and their
        /// part of speech is one of <see cref="KeywordPosNames"/>. A word becomes a candidate when it is
        /// repeated more than once (i.e. occurs at least three times); its weight is
        /// <c>length * repeats / 1000</c> for texts shorter than 1000 characters and
        /// <c>length * repeats * 1000 / textLength</c> otherwise, where <c>repeats</c> is the number of
        /// occurrences minus one. Candidates are ranked by descending weight.
        /// </remarks>
        /// <param name="keyWord">The text to analyse; <c>null</c> is treated as an empty text.</param>
        /// <param name="separator">Text placed in front of every keyword in the resulting key.</param>
        /// <returns>
        /// A dictionary with exactly one entry. The key is the concatenation of
        /// <paramref name="separator"/> + keyword for each selected keyword (so a non-empty key starts
        /// with the separator); the value is the sum of the selected weights truncated to a long.
        /// When nothing qualifies the entry is ("", 0).
        /// </returns>
        public static Dictionary<string, long> GetKeys(string keyWord, string separator = ",")
        {
            int textLength = keyWord == null ? 0 : keyWord.Length;

            // Group the qualifying tokens by word. GroupBy uses the same (ordinal) string equality the
            // grouping is meant to have, and every group - including the last one - is processed.
            // The groups are then ordered by word so that candidates with equal weight keep a
            // word-sorted order through the stable sort below.
            var groupedWords = SegmentText(keyWord)
                .Where(w => w.Frequency > 0
                            && w.Word.Length > 1
                            && KeywordPosNames.Contains(w.Pos.ToString()))
                .GroupBy(w => w.Word)
                .OrderBy(g => g.Key);

            var candidates = new List<KeyValuePair<string, double>>();
            foreach (var wordGroup in groupedWords)
            {
                // Number of repeats after the first occurrence.
                int repeats = wordGroup.Count() - 1;
                if (repeats <= 1)
                {
                    continue;
                }

                // Computed in floating point: integer arithmetic would truncate the weight
                // (to 0 for nearly every word of a short text) before it is ever compared.
                double lengthTimesRepeats = (double)wordGroup.Key.Length * repeats;
                double weight;
                if (textLength < ShortContentLength)
                {
                    // Short content: fixed divisor.
                    weight = lengthTimesRepeats / 1000.0;
                }
                else
                {
                    weight = lengthTimesRepeats * 1000.0 / textLength;
                }

                candidates.Add(new KeyValuePair<string, double>(wordGroup.Key, weight));
            }

            string joined = "";
            double totalWeight = 0.0;
            foreach (var candidate in candidates.OrderByDescending(p => p.Value).Take(MaxKeywords))
            {
                // The separator is prepended to every keyword, the first one included;
                // callers may rely on that shape, so it is kept.
                joined += separator + candidate.Key;
                totalWeight += candidate.Value;
            }

            return new Dictionary<string, long> { { joined, (long)totalWeight } };
        }

        /// <summary>
        /// Counts how many times each distinct word occurs in the segmented text.
        /// </summary>
        /// <param name="content">The text to analyse; <c>null</c> is treated as an empty text.</param>
        /// <returns>
        /// A dictionary mapping every distinct non-empty word to its number of occurrences;
        /// empty when the segmenter yields no words.
        /// </returns>
        public static Dictionary<string, long> GetPrimaryKeys(string content)
        {
            var counts = new Dictionary<string, long>();

            // Entries are added in word-sorted order.
            foreach (var wordGroup in SegmentText(content).GroupBy(w => w.Word).OrderBy(g => g.Key))
            {
                counts.Add(wordGroup.Key, wordGroup.Count());
            }

            return counts;
        }

        /// <summary>
        /// Runs the PanGu segmenter over <paramref name="text"/> and returns the usable tokens.
        /// </summary>
        /// <param name="text">The text to segment; <c>null</c> is segmented as an empty string.</param>
        /// <returns>The tokens whose entry and word are neither null nor empty; never null.</returns>
        private static List<PanGu.WordInfo> SegmentText(string text)
        {
            // Init is invoked before each segmentation.
            PanGu.Segment.Init();
            PanGu.Segment segmenter = new PanGu.Segment();
            ICollection<PanGu.WordInfo> rawWords = segmenter.DoSegment(text ?? "");
            if (rawWords == null)
            {
                return new List<PanGu.WordInfo>();
            }

            // Defensive filtering - NOTE(review): confirm whether DoSegment can actually yield
            // null entries or entries without a word.
            return rawWords.Where(w => w != null && !string.IsNullOrEmpty(w.Word)).ToList();
        }
    }
}
